QualityTestTool.java example

Explorer
damp.ekeko.snippets-master
- damp.ekeko.snippets.plugin
  - src
    - damp
      - ekeko
        snippets
        BoundDirective.java
        DirectiveOperandBinding.java
        EkekoSnippetsPlugin.java
        ExtractedSnippet.java
        NaiveASTFlattener.java
        OperatorOperandBinding.java
        SnippetBaseListener.java
        SnippetBaseVisitor.java
        SnippetExtractor.java
        SnippetLexer.java
        SnippetListener.java
        SnippetParser.java
        SnippetVisitor.java
        data
        SnippetOperator.java
        TemplateGroup.java
        geneticsearch
        PartialJavaProjectModel.java
        gui
        BoundDirectivesEditorDialog.java
        BoundDirectivesViewer.java
        ChartCanvas.java
        ClojureFileEditorInput.java
        DirectiveOperandBindingEditingSupport.java
        DirectiveOperandBindingLabelProviderValue.java
        DirectiveSelectionDialog.java
        IntendedResultsEditor.java
        IntendedResultsEditorCommandHandler.java
        IntendedResultsEditorInput.java
        IntendedResultsEditorPersistableElementFactory.java
        MutationHistoryDialog.java
        OperandBindingLabelProviderDescription.java
        OperatorOperandBindingEditingSupport.java
        OperatorOperandBindingLabelProviderValue.java
        OperatorOperandsView.java
        OperatorOperandsViewer.java
        OperatorTreeContentProvider.java
        OperatorTreeLabelProvider.java
        PopulationInspectorDialog.java
        QueryInspectorDialog.java
        RecommendationEditor.java
        RecommendationEditorCommandHandler.java
        RecommendationEditorInput.java
        RecommendationEditorPersistableElementFactory.java
        RewritesTemplateEditor.java
        SubjectsTemplateEditor.java
        TemplateCodeGenerator.java
        TemplateEditor.java
        TemplateEditorActionBarContributor.java
        TemplateEditorCommandHandler.java
        TemplateEditorInput.java
        TemplateEditorPersistableElementFactory.java
        TemplateGroupNodeSelectionDialog.java
        TemplateGroupTemplateElement.java
        TemplateGroupViewer.java
        TemplateGroupViewerNodeDoubleClickListener.java
        TemplateGroupViewerNodeSelectionEvent.java
        TemplateGroupViewerNodeSelectionListener.java
        TemplatePrettyPrinter.java
        TemplateTreeContentProvider.java
        TemplateTreeLabelProviders.java
        TransformationEditor.java
        TransformationEditorActionBarContributor.java
        TransformationEditorCommandHandler.java
        TransformationEditorInput.java
        TransformationEditorPersistableElementFactory.java
        TransformationOverviewEditor.java
    - ec
      - util
        MersenneTwister.java
- damp.ekeko.snippets.plugin.test
  - resources
  - src
    - test
      - damp
        ekeko
        snippets
        EkekoSnippetsTest.java
        experiments
        GeneticSearchTest.java
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.quality;

import java.io.*;
import java.util.*;
import java.util.logging.*;

import net.nutch.util.*;
import net.nutch.quality.dynamic.*;

/***************************************************
 * The QualityTestTool runs a bunch of tests
 * against both Nutch and external search engines.
 * The inputs for the actual quality metric computation
 * can be precomputed using ResultSetGenerator and 
 * URLInsetTester.  OR, QualityTestTool can run
 * those programs for you.
 *
 * Whether the values have been computed before, or
 * whether you ask QualityTestTool to do it, they
 * remain where they are.  We do not delete the results
 * of the test, in case the user wants to run computations
 * again without all the work of building the test
 * material.
 *
 * We compute two metrics that are interesting.  The
 * first compares page-coverage among the engines.
 * The second compares ranking-quality.  We emit
 * statistics for both.
 *
 * @author Mike Cafarella
 **************************************************/
public class QualityTestTool {
    // Name of the file (inside inputsDir) that holds every URL returned by
    // any engine, each paired with the number of engines that returned it.
    final static String UNIQUE_URLS = "uniqueURLs.bin";
    // Name under which the query list is copied into inputsDir.
    final static String QUERY_LIST = "queryList.txt";
    // Suffix of the per-engine files recording, for each URL, whether the
    // engine has it in-set.  The part before the suffix is the engine name.
    final static String URL_INSET_SUFFIX = ".urlInset";
    // Suffix of the per-engine files recording the results of every query.
    // The part before the suffix is the engine name.
    final static String QUERY_RESULTS_SUFFIX = ".queryResults";
    // Suffix of the remote search engine description files.
    final static String ENGINE_DESC_SUFFIX = ".src";
    // Engine-name prefix used for local Nutch segments ("Nutch.0", "Nutch.1", ...).
    final static String NUTCH_LABEL = "Nutch";
    public static final Logger LOG = LogFormatter.getLogger("net.nutch.quality.QualityTestTool");

    // Directory that holds all the input files for the tests.
    File inputsDir;
    // Maps engine name to the Set of URLs that engine has in-set.
    // Null until computeOrderingScore() fills it in.
    TreeMap engineInsetData = null;

    /**
     * Builds a tool over an inputs directory that has already been
     * populated.  The directory is only remembered here; nothing is
     * read from it until the tests are run.
     *
     * @param inputsDir directory holding the precomputed test inputs
     */
    public QualityTestTool(File inputsDir) {
        this.inputsDir = inputsDir;
    }

    /**
     * Builds a tool that computes all of its inputs itself.
     *
     * A new directory named "localInputs" (resolved against the current
     * working directory) is created and then filled in by createInputs().
     * An existing "localInputs" is never reused or overwritten.
     *
     * @param externalEngines directory of search engine description files
     * @param userAgent user-agent string handed to the remote page extractors
     * @param segmentsDirs paths of the local Nutch segments to test
     * @param queryList file holding the list of queries to run
     * @throws IOException if "localInputs" already exists, cannot be
     *         created, or if building the inputs fails
     * @throws ParseException propagated from createInputs()
     */
    public QualityTestTool(File externalEngines, String userAgent, String segmentsDirs[], File queryList) throws IOException, ParseException {
        this.inputsDir = new File("localInputs");
        if (inputsDir.exists()) {
            throw new IOException("Cannot run QualityTestTool.  File " + inputsDir + " already exists");
        }
        // mkdirs() signals failure only through its return value.  Report it
        // here instead of failing later with a less obvious file error.
        if (! inputsDir.mkdirs()) {
            throw new IOException("Cannot run QualityTestTool.  Could not create directory " + inputsDir);
        }

        createInputs(externalEngines, userAgent, queryList, segmentsDirs);
    }

    /**
     * Builds every file needed for a Quality Test and places it in the
     * directory at "inputsDir".  The work is done in six steps:
     *
     *  1. copy the query list to "queryList.txt";
     *  2. run the queries against every remote engine described by a
     *     "*.src" file, writing "ENGINE.queryResults";
     *  3. run the queries against every local Nutch segment, writing
     *     "Nutch.N.queryResults";
     *  4. collect every URL returned by any engine, count how many
     *     engines returned it, and write that to "uniqueURLs.bin";
     *  5. test every remote engine for membership of each unique URL,
     *     writing "ENGINE.urlInset";
     *  6. do the same for every Nutch segment, writing "Nutch.N.urlInset".
     *
     * @param externalEngines directory of search engine description files
     * @param userAgent user-agent string handed to the remote page extractors
     * @param queryList file holding the list of queries to run
     * @param segmentsDirs paths of the local Nutch segments to test
     * @throws IOException if a directory cannot be listed or a file
     *         cannot be read or written
     * @throws ParseException declared for the extractor / generator
     *         calls made here
     */
    private void createInputs(File externalEngines, String userAgent, File queryList, String segmentsDirs[]) throws IOException, ParseException {
        //
        // 1st, just copy the query list
        //
        LOG.info("CreateInputs, 1 of 6:  Copying query list...");
        File targetQueryList = new File(inputsDir, QUERY_LIST);
        FileUtil.copyContents(queryList, targetQueryList, true);

        //
        // 2nd, test the queryList against all the remote
        // search engines.  Use ResultSetGenerator for this.
        //
        LOG.info("CreateInputs, 2 of 6:  Testing queries against remote engines...");
        File engineDescs[] = externalEngines.listFiles();
        if (engineDescs == null) {
            // listFiles() yields null when the path is not a listable directory.
            throw new IOException("Cannot list search engine descriptions in " + externalEngines);
        }
        for (int i = 0; i < engineDescs.length; i++) {
            String filename = engineDescs[i].getName();
            if (! filename.endsWith(ENGINE_DESC_SUFFIX)) {
                continue;
            }
            // The engine name is the file name minus the suffix
            String engineName = filename.substring(0, filename.length() - ENGINE_DESC_SUFFIX.length());

            PageExtractor.IExtractor extractor = new PageExtractor.RemotePageExtractor(engineDescs[i], userAgent, false);
            ResultSetGenerator rsg = new ResultSetGenerator(extractor, false);
            rsg.processQueries(targetQueryList, new File(inputsDir, engineName + QUERY_RESULTS_SUFFIX));
        }

        //
        // 3rd, test the queryList against the Nutch segments.
        //
        LOG.info("CreateInputs, 3 of 6:  Testing queries against local Nutch segments...");
        for (int i = 0; i < segmentsDirs.length; i++) {
            PageExtractor.IExtractor extractor = new PageExtractor.NutchExtractor(segmentsDirs[i]);
            ResultSetGenerator rsg = new ResultSetGenerator(extractor, false);
            rsg.processQueries(targetQueryList, new File(inputsDir, NUTCH_LABEL + "." + i + QUERY_RESULTS_SUFFIX));
        }

        //
        // 4th, now that we have all the queryResults, we
        // compute the uniquified URL list.  This is written
        // to a file in inputsDir.
        //
        // No filtering happens here: every URL is stored together
        // with the number of engines that returned it.  The code
        // that reads the list back can then drop the wholly
        // "idiosyncratic" URLs that too few engines agree on.
        //
        LOG.info("CreateInputs, 4 of 6:  Computing unique URL set...");
        TreeMap returnedURLSets = new TreeMap();
        SortedMap uniqueMap = new TreeMap();

        File resultLists[] = inputsDir.listFiles();
        if (resultLists == null) {
            throw new IOException("Cannot list query results in " + inputsDir);
        }
        for (int i = 0; i < resultLists.length; i++) {
            String filename = resultLists[i].getName();
            if (! filename.endsWith(QUERY_RESULTS_SUFFIX)) {
                continue;
            }
            String engineName = filename.substring(0, filename.length() - QUERY_RESULTS_SUFFIX.length());

            // Store all the URLs returned by this engine
            SortedSet returnedURLSet = new TreeSet();
            returnedURLSets.put(engineName, returnedURLSet);

            // Load in the engine's result set
            DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(resultLists[i])));
            try {
                int numQueries = in.readInt();
                for (int j = 0; j < numQueries; j++) {
                    in.readUTF();   // the query text itself is not needed here
                    int numResults = in.readInt();
                    for (int k = 0; k < numResults; k++) {
                        // Remember all the URLs from this engine
                        returnedURLSet.add(in.readUTF());
                    }
                }
            } finally {
                in.close();
            }
        }

        //
        // Count, for every URL seen anywhere, how many engines
        // returned it.
        //
        for (Iterator it = returnedURLSets.values().iterator(); it.hasNext(); ) {
            SortedSet curSet = (SortedSet) it.next();

            for (Iterator it2 = curSet.iterator(); it2.hasNext(); ) {
                String term = (String) it2.next();
                if (uniqueMap.containsKey(term)) {
                    // Already counted when an earlier engine's set was
                    // walked; the count does not depend on which set
                    // the URL was first met in.
                    continue;
                }

                int containsCount = 0;
                for (Iterator it3 = returnedURLSets.values().iterator(); it3.hasNext(); ) {
                    SortedSet testSet = (SortedSet) it3.next();
                    if (testSet.contains(term)) {
                        containsCount++;
                    }
                }
                uniqueMap.put(term, new Integer(containsCount));
            }
        }

        // Now write out the unique URL set: a count, then (url, engine count) pairs
        File uniqueURLs = new File(inputsDir, UNIQUE_URLS);
        DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(uniqueURLs)));
        try {
            out.writeInt(uniqueMap.size());
            for (Iterator it = uniqueMap.entrySet().iterator(); it.hasNext(); ) {
                Map.Entry entry = (Map.Entry) it.next();
                out.writeUTF((String) entry.getKey());
                out.writeInt(((Integer) entry.getValue()).intValue());
            }
        } finally {
            out.close();
        }

        //
        // 5th, we test each remote search engine to see
        // if it contains each unique URL.  We write the
        // results of each test to inputsDir.  Use URLInsetTester
        // for this.
        //
        LOG.info("CreateInputs, 5 of 6:  Test membership of each URL in every remote engine...");
        for (int i = 0; i < engineDescs.length; i++) {
            String filename = engineDescs[i].getName();
            if (! filename.endsWith(ENGINE_DESC_SUFFIX)) {
                continue;
            }
            String engineName = filename.substring(0, filename.length() - ENGINE_DESC_SUFFIX.length());

            // Test the URLs to see if they are in-set
            PageExtractor.RemotePageExtractor extractor = new PageExtractor.RemotePageExtractor(engineDescs[i], userAgent, false);
            URLInsetTester uit = new URLInsetTester(extractor, false);
            uit.testURLs(uniqueURLs, (TreeSet) returnedURLSets.get(engineName), new File(inputsDir, engineName + URL_INSET_SUFFIX));
        }

        //
        // 6th, we test Nutch to see if it contains each
        // unique URL.  Write the results to inputsDir.
        //
        LOG.info("CreateInputs, 6 of 6:  Test membership of each URL in local Nutch segments...");
        for (int i = 0; i < segmentsDirs.length; i++) {
            String engineName = NUTCH_LABEL + "." + i;
            PageExtractor.NutchExtractor extractor = new PageExtractor.NutchExtractor(segmentsDirs[i]);
            URLInsetTester uit = new URLInsetTester(extractor, false);
            uit.testURLs(uniqueURLs, (TreeSet) returnedURLSets.get(engineName), new File(inputsDir, engineName + URL_INSET_SUFFIX));
        }
    }

    /**
     * Runs the requested tests over the files found in inputsDir and
     * prints the statistics to standard output.  The directory is
     * expected to contain files of this format:
     *
     *  "queryList.txt"
     *  "uniqueURLs.bin"
     *  "searchEngineNameA.queryResults"
     *  "searchEngineNameA.urlInset"
     *  "searchEngineNameB.queryResults"
     *  "searchEngineNameB.urlInset"
     *  ...
     *
     * We assume that queryList.txt has only a few hundred
     * items in it, tops.  If that assumption doesn't hold,
     * then this code might not be efficient enough.
     *
     * @param testCoverage whether to compute the coverage score (and,
     *        when coverageConsensus is positive, the 'eccentric' score)
     * @param testOrdering whether to compute the ordering scores
     * @param coverageConsensus fraction of engines that must have a URL
     *        for that URL to count in the coverage statistics
     * @throws IOException if any input file cannot be read
     */
    public void runTests(boolean testCoverage, boolean testOrdering, double coverageConsensus) throws IOException {
        if (testCoverage) {
            // Part I.  Coverage numbers.
            computeCoverageScore(coverageConsensus);
            System.out.println();
            System.out.println();

            // Part I.5.  The 'top-10 eccentric' score, which only
            // makes sense once some consensus is being demanded.
            if (coverageConsensus > 0.0) {
                computeEccentricScore(coverageConsensus);
                System.out.println();
                System.out.println();
            }
        }

        // Part II.  Ordering scores.
        if (testOrdering) {
            computeOrderingScore();
        }
    }

    /**
     * Compute page-coverage over all the engines and print one line
     * per engine to standard output.  Uses information stored in
     * "inputsDir".
     *
     * An engine scores a point for every URL that it has in-set and
     * that is also present in at least ceil(numEngines *
     * coverageConsensus) engines' results.
     *
     * @param coverageConsensus fraction of engines that must have
     *        returned a URL for the URL to be counted
     * @throws IOException if inputsDir cannot be listed or one of its
     *         files cannot be read
     */
    private void computeCoverageScore(double coverageConsensus) throws IOException {
        File contents[] = inputsDir.listFiles();
        if (contents == null) {
            // listFiles() yields null when the path is not a listable directory.
            throw new IOException("Cannot list test inputs in " + inputsDir);
        }

        //
        // 1.  Figure out how many engines we're testing, and how
        // many times a term needs to appear to satisfy "coverageConsensus"
        //
        int numEngines = 0;
        for (int i = 0; i < contents.length; i++) {
            if (contents[i].getName().endsWith(URL_INSET_SUFFIX)) {
                numEngines++;
            }
        }
        int requiredCount = (int) Math.ceil(numEngines * coverageConsensus);
        System.out.println("URL must be present in at least " + requiredCount + " (" + coverageConsensus + ") item(s)");

        //
        // 2.  Load in the complete uniquified URL list, along
        //    with counts of how many engines have the URL.
        //    Don't include terms that fail to satisfy coverageConsensus
        //
        TreeMap uniqueURLs = new TreeMap();
        DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(new File(inputsDir, UNIQUE_URLS))));
        try {
            int numItems = in.readInt();
            for (int i = 0; i < numItems; i++) {
                String url = in.readUTF();
                int count = in.readInt();
                if (count >= requiredCount) {
                    uniqueURLs.put(url, new Integer(count));
                }
            }
        } finally {
            in.close();
        }

        //
        // 3.  Go through each engine and load in the list
        // of inset-URLs.
        //
        TreeMap urlInsetScores = new TreeMap();
        int maxInsetScore = uniqueURLs.size();
        for (int i = 0; i < contents.length; i++) {
            String filename = contents[i].getName();
            if (! filename.endsWith(URL_INSET_SUFFIX)) {
                continue;
            }
            // The engine name is the file name minus the suffix
            String engineName = filename.substring(0, filename.length() - URL_INSET_SUFFIX.length());

            int insetScore = 0;
            DataInputStream din = new DataInputStream(new BufferedInputStream(new FileInputStream(contents[i])));
            try {
                int numItems = din.readInt();

                //
                // Load in whether each URL was in-set or not.
                // If it was in-set for the engine, and is in
                // the qualified unique set overall, then the
                // engine gets a point.
                //
                for (int j = 0; j < numItems; j++) {
                    String url = din.readUTF();
                    // The flag must be read for every record to stay
                    // aligned with the file, so it is read first.
                    boolean inset = din.readBoolean();
                    if (inset && uniqueURLs.containsKey(url)) {
                        insetScore++;
                    }
                }
            } finally {
                din.close();
            }

            // When done processing this file, store the score
            urlInsetScores.put(engineName, new Integer(insetScore));
        }

        //
        // 4.  Output the coverage statistics
        //
        System.out.println("Engine\t\tCoverage Score");
        System.out.println("--------------------------------");
        for (Iterator it = urlInsetScores.entrySet().iterator(); it.hasNext(); ) {
            Map.Entry entry = (Map.Entry) it.next();
            String engineName = (String) entry.getKey();
            int score = ((Integer) entry.getValue()).intValue();

            System.out.println(engineName + "\t\t" + score + " of " + maxInsetScore + "\t(" + ((score / (1.0 * maxInsetScore)) * 100) + "%)");
        }
    }

    /**
     * Figure out how many of an engine's returned URLs are
     * not "eccentric".  That is, the URL appears in the results
     * of at least "coverageConsensus" of the engines.  One line
     * per engine is printed to standard output.
     *
     * @param coverageConsensus fraction of engines that must have
     *        returned a URL for the URL to count as shared
     * @throws IOException if inputsDir cannot be listed or one of its
     *         files cannot be read
     */
    private void computeEccentricScore(double coverageConsensus) throws IOException {
        File contents[] = inputsDir.listFiles();
        if (contents == null) {
            // listFiles() yields null when the path is not a listable directory.
            throw new IOException("Cannot list test inputs in " + inputsDir);
        }

        //
        // 1.  Figure out how many engines we're testing, and how
        // many times a term needs to appear to satisfy "coverageConsensus"
        //
        int numEngines = 0;
        for (int i = 0; i < contents.length; i++) {
            if (contents[i].getName().endsWith(URL_INSET_SUFFIX)) {
                numEngines++;
            }
        }
        int requiredCount = (int) Math.ceil(numEngines * coverageConsensus);
        System.out.println("URL must be present in at least " + requiredCount + " (" + coverageConsensus + ") item(s)");

        //
        // 2.  Load in the complete uniquified URL list, along
        //    with counts of how many engines have the URL.
        //    Don't include terms that fail to satisfy coverageConsensus
        //
        TreeMap sharedURLs = new TreeMap();
        DataInputStream uniqueIn = new DataInputStream(new BufferedInputStream(new FileInputStream(new File(inputsDir, UNIQUE_URLS))));
        try {
            int numItems = uniqueIn.readInt();
            for (int i = 0; i < numItems; i++) {
                String url = uniqueIn.readUTF();
                int count = uniqueIn.readInt();
                if (count >= requiredCount) {
                    sharedURLs.put(url, new Integer(count));
                }
            }
        } finally {
            uniqueIn.close();
        }

        //
        // 3.  Go through each engine and load in its
        // result lists.  Check to see if each URL in
        // this set is also present in the "sharedURLs"
        // table.  The ratio of in-shared-set vs returned
        // is the value we're after for each engine.
        //
        TreeMap engineURLs = new TreeMap(), engineSharedURLs = new TreeMap();
        for (int i = 0; i < contents.length; i++) {
            String filename = contents[i].getName();
            if (! filename.endsWith(QUERY_RESULTS_SUFFIX)) {
                continue;
            }
            // The engine name is the file name minus the suffix
            String engineName = filename.substring(0, filename.length() - QUERY_RESULTS_SUFFIX.length());

            int engineTopURLs = 0, inSharedSet = 0;
            DataInputStream resultsIn = new DataInputStream(new BufferedInputStream(new FileInputStream(contents[i])));
            try {
                int numQueries = resultsIn.readInt();
                for (int j = 0; j < numQueries; j++) {
                    resultsIn.readUTF();   // the query text itself is not needed here
                    int numResults = resultsIn.readInt();
                    for (int k = 0; k < numResults; k++) {
                        String result = resultsIn.readUTF();

                        engineTopURLs++;
                        if (sharedURLs.containsKey(result)) {
                            inSharedSet++;
                        }
                    }
                }
                engineURLs.put(engineName, new Integer(engineTopURLs));
                engineSharedURLs.put(engineName, new Integer(inSharedSet));
            } finally {
                resultsIn.close();
            }
        }

        //
        // 4.  Output stats
        //
        System.out.println("Engine\t\tIn-shared-set score");
        System.out.println("--------------------------------");
        for (Iterator it = engineURLs.keySet().iterator(); it.hasNext(); ) {
            String engineName = (String) it.next();
            int urlScore = ((Integer) engineURLs.get(engineName)).intValue();
            int sharedScore = ((Integer) engineSharedURLs.get(engineName)).intValue();

            System.out.println(engineName + "\t\t" + sharedScore + " of " + urlScore + "\t(" + ((sharedScore / (1.0 * urlScore)) * 100) + "%)");
        }
    }

    /**
     * Compute numbers that tell us how good the orderings are, and
     * print them to standard output.
     *
     * Part of this test involves using the MarkovRankSolver to
     * compute a "best group-contribution ranking" that minimizes
     * the overall Kendall Tau distance between the complete
     * ranking and each contributing sublist.
     *
     * Two tables are printed: the summed Kendall Tau distance per
     * engine, and a "Best-Page" score per engine.  As a side effect
     * the "engineInsetData" field is replaced with freshly loaded data.
     *
     * @throws IOException if inputsDir cannot be listed, holds no
     *         query result files, or one of its files cannot be read
     */
    private void computeOrderingScore() throws IOException {
        //
        // Before we do anything, load in the result lists
        // and the URL-inset data.
        //
        TreeMap engineResults = new TreeMap();
        engineInsetData = new TreeMap();
        File resultFiles[] = inputsDir.listFiles();
        if (resultFiles == null) {
            // listFiles() yields null when the path is not a listable directory.
            throw new IOException("Cannot list test inputs in " + inputsDir);
        }
        for (int i = 0; i < resultFiles.length; i++) {
            String filename = resultFiles[i].getName();
            if (! filename.endsWith(QUERY_RESULTS_SUFFIX)) {
                continue;
            }
            // The engine name is the file name minus the suffix
            String engineName = filename.substring(0, filename.length() - QUERY_RESULTS_SUFFIX.length());

            // Load in results
            DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(resultFiles[i])));
            try {
                TreeMap resultLists = new TreeMap();
                int numQueries = in.readInt();
                for (int j = 0; j < numQueries; j++) {
                    String query = in.readUTF();
                    int numResults = in.readInt();
                    String resultList[] = new String[numResults];
                    for (int k = 0; k < numResults; k++) {
                        resultList[k] = in.readUTF();
                    }
                    resultLists.put(query, resultList);
                }
                engineResults.put(engineName, resultLists);
            } finally {
                in.close();
            }

            // Next, load in the inset-data
            DataInputStream insetIn = new DataInputStream(new BufferedInputStream(new FileInputStream(new File(inputsDir, engineName + URL_INSET_SUFFIX))));
            try {
                TreeSet insetURLs = new TreeSet();
                int numItems = insetIn.readInt();
                for (int j = 0; j < numItems; j++) {
                    String url = insetIn.readUTF();
                    if (insetIn.readBoolean()) {
                        insetURLs.add(url);
                    }
                }
                engineInsetData.put(engineName, insetURLs);
            } finally {
                insetIn.close();
            }
        }

        //
        // We now have two large useful structures.
        //
        // A. engineResults is a Map that maps engine names
        // to another Map.  The value Map maps Queries to
        // String Arrays of results.
        //
        // B. engineInsetData is a Map that maps engine names
        // to a Set.  This Set contains all the relevant URLs
        // that the engine has indexed.
        //

        // Without a single engine there is no query list to walk;
        // firstKey() below would fail with NoSuchElementException.
        if (engineResults.isEmpty()) {
            throw new IOException("No " + QUERY_RESULTS_SUFFIX + " files found in " + inputsDir);
        }

        //
        // Each engine should have identical keys listed in
        // its Map from query terms to Arrays of results.  So
        // just pick the first one from 'engineResults'.
        //
        Map defaultQueryMap = (Map) engineResults.get(engineResults.firstKey());
        Map overallDistances = new TreeMap(), bestPageScores = new TreeMap();
        for (Iterator it = engineResults.keySet().iterator(); it.hasNext(); ) {
            String engineName = (String) it.next();
            overallDistances.put(engineName, new Double(0.0));
            bestPageScores.put(engineName, new Double(0.0));
        }

        // Iterate through every query.
        for (Iterator it = defaultQueryMap.keySet().iterator(); it.hasNext(); ) {
            String query = (String) it.next();

            //
            // Go through every engine, finding the results for the query.
            // Build a good full-ordering using the Markov solver
            //
            MarkovRankSolver fullOrdering = new MarkovRankSolver();
            for (Iterator it2 = engineResults.values().iterator(); it2.hasNext(); ) {
                Map queryMap = (Map) it2.next();
                fullOrdering.addOrdering((String[]) queryMap.get(query));
            }

            fullOrdering.solveRanking();
            int numMarkovStates = fullOrdering.getNumStates();

            //
            // For each engine, compute the distance to the full-ordering
            //
            for (Iterator it2 = engineResults.entrySet().iterator(); it2.hasNext(); ) {
                Map.Entry entry = (Map.Entry) it2.next();
                String engineName = (String) entry.getKey();
                String results[] = (String[]) ((Map) entry.getValue()).get(query);

                // Find how many binary misorderings there are between
                // the results list and the full markov list.  A list
                // with fewer than two items is left at distance zero.
                double curDistance = 0.0;
                if (results.length > 1) {
                    curDistance = fullOrdering.getKendallTauDistance(results, true);
                }
                double oldScore = ((Double) overallDistances.get(engineName)).doubleValue();
                overallDistances.put(engineName, new Double(oldScore + curDistance));
            }

            //
            // For each engine, compute the "Best Pages score", which
            // measures how close each engine's result list matches the
            // Markov model's ranking.
            //
            for (Iterator it2 = engineResults.entrySet().iterator(); it2.hasNext(); ) {
                Map.Entry entry = (Map.Entry) it2.next();
                String engineName = (String) entry.getKey();
                String results[] = (String[]) ((Map) entry.getValue()).get(query);

                // Each result contributes more the nearer its Markov
                // position is to zero.
                double newScore = 0.0;
                for (int i = 0; i < results.length; i++) {
                    int markovPos = fullOrdering.getPos(results[i]);
                    newScore += (numMarkovStates - markovPos);
                }

                double oldScore = ((Double) bestPageScores.get(engineName)).doubleValue();
                bestPageScores.put(engineName, new Double(oldScore + newScore));
            }
        }

        // Emit score to stdout
        System.out.println("Engine\t\tNormalized Kendall Tau Distance");
        System.out.println("--------------------------------");
        for (Iterator it = overallDistances.entrySet().iterator(); it.hasNext(); ) {
            Map.Entry entry = (Map.Entry) it.next();
            System.out.println(entry.getKey() + "\t\t" + entry.getValue());
        }

        System.out.println();
        System.out.println();
        System.out.println("Engine\t\t'Best-Page Score'");
        System.out.println("--------------------------------");
        for (Iterator it = bestPageScores.entrySet().iterator(); it.hasNext(); ) {
            Map.Entry entry = (Map.Entry) it.next();
            System.out.println(entry.getKey() + "\t\t" + entry.getValue());
        }
    }

    /**
     * Command-line entry point.  Parses the arguments, builds a
     * QualityTestTool either from scratch ("-initTest") or over an
     * existing work directory ("-repeatTest"), and runs the tests.
     *
     * Recognized options:
     *   -initTest DIR AGENT QUERIES  compute all inputs first
     *   -repeatTest DIR              reuse previously computed inputs
     *   -nutchSegment DIR            add one Nutch segment (repeatable)
     *   -nutchSegmentSet DIR         add every subdirectory of DIR as a segment
     *   -coverageConsensus VALUE     a double, expected between 0.0 and 1.0
     *   -noCoverageTest              skip the coverage test
     *   -noOrderingTest              skip the ordering test
     *
     * Problems with the arguments are reported on standard output and
     * the method returns without running anything.
     *
     * @param argv the command-line arguments
     * @throws IOException if building the inputs or running the tests fails
     * @throws ParseException propagated from the QualityTestTool constructor
     */
    public static void main(String argv[]) throws IOException, ParseException {
        if (argv.length < 2) {
            printUsage();
            return;
        }

        // vars for parsing command-line options
        File extEngineDescs = null, queryList = null;
        File existingWorkDir = null;
        String userAgent = null;
        Vector nutchSegments = new Vector();
        boolean initTest = false, repeatTest = false;
        boolean testCoverage = true, testOrdering = true;
        double coverageConsensus = 0.0;

        // loop through cmd args
        for (int i = 0; i < argv.length; i++) {
            if ("-initTest".equals(argv[i])) {
                if (! hasOperands(argv, i, 3)) {
                    return;
                }
                extEngineDescs = new File(argv[i + 1]);
                userAgent = argv[i + 2];
                queryList = new File(argv[i + 3]);
                i += 3;
                initTest = true;
            } else if ("-repeatTest".equals(argv[i])) {
                if (! hasOperands(argv, i, 1)) {
                    return;
                }
                existingWorkDir = new File(argv[i + 1]);
                repeatTest = true;
                i++;
            } else if ("-nutchSegment".equals(argv[i])) {
                if (! hasOperands(argv, i, 1)) {
                    return;
                }
                nutchSegments.add(new File(argv[i + 1]).getPath());
                i++;
            } else if ("-nutchSegmentSet".equals(argv[i])) {
                if (! hasOperands(argv, i, 1)) {
                    return;
                }
                File segmentSet = new File(argv[i + 1]);
                File segmentSubdirs[] = segmentSet.listFiles();
                if (segmentSubdirs == null) {
                    // listFiles() yields null when the path is not a listable directory.
                    System.out.println("Cannot list segment set directory: " + segmentSet);
                    return;
                }
                // Walk the subdirectories with their own index; the
                // argument index 'i' must not be used here.
                for (int j = 0; j < segmentSubdirs.length; j++) {
                    if (segmentSubdirs[j].isDirectory()) {
                        nutchSegments.add(segmentSubdirs[j].getPath());
                    }
                }
                i++;
            } else if ("-coverageConsensus".equals(argv[i])) {
                if (! hasOperands(argv, i, 1)) {
                    return;
                }
                coverageConsensus = Double.parseDouble(argv[i + 1]);
                i++;
            } else if ("-noCoverageTest".equals(argv[i])) {
                testCoverage = false;
            } else if ("-noOrderingTest".equals(argv[i])) {
                testOrdering = false;
            } else {
                System.out.println("Unknown arg: " + argv[i]);
                return;
            }
        }

        // Know what kind of test to run: exactly one of the two modes
        if (initTest == repeatTest) {
            System.out.println("Must either 'initTest' or 'repeatTest'");
            return;
        }

        // Make sure something's being run
        if (! testCoverage && ! testOrdering) {
            System.out.println("Must run at least one test.");
            return;
        }

        // Build the QTT
        QualityTestTool qtt = null;
        if (initTest) {
            String segments[] = (String[]) nutchSegments.toArray(new String[nutchSegments.size()]);
            qtt = new QualityTestTool(extEngineDescs, userAgent, segments, queryList);
        } else {
            qtt = new QualityTestTool(existingWorkDir);
        }

        // Kick it off.
        qtt.runTests(testCoverage, testOrdering, coverageConsensus);
    }

    /**
     * Prints the command-line usage summary to standard output.
     */
    private static void printUsage() {
        System.out.println("Usage: java net.nutch.quality.QualityTestTool (-initTest <externalEngineDirectory> <userAgent> <queryList> [-nutchSegment <segmentsDirectory0>] [-nutchSegment <segmentsDirectory1>] ... [-nutchSegmentSet <segmentDir>]) | (-repeatTest <existingWorkDir>) [-coverageConsensus <double>] [-noCoverageTest] [-noOrderingTest]");
        System.out.println();
        System.out.println("Note that 'coverageConsensus' should be a value between 0.0 and 1.0");
    }

    /**
     * Checks that the option at argv[pos] is followed by at least
     * "needed" further arguments, reporting the problem on standard
     * output when it is not.
     *
     * @return true if enough arguments follow the option
     */
    private static boolean hasOperands(String argv[], int pos, int needed) {
        if (pos + needed < argv.length) {
            return true;
        }
        System.out.println("Missing value(s) for arg: " + argv[pos]);
        printUsage();
        return false;
    }
}